# Load in libraries -----------------------------------------------------------
# (The header line above was bare prose — a syntax error in plain R — and is
# now a comment.)
library(dplyr)      # data manipulation verbs (select, mutate, %>%)
library(ggplot2)    # boxplots / faceted plots
library(factoextra) # fviz_nbclust, fviz_dend
library(NbClust)    # battery of cluster-number indices
library(GGally)     # ggpairs scatterplot matrix
library(plotly)     # interactive 3D scatterplots
library(corrplot)   # correlation-matrix visualisation
# Load in Pareto table ---------------------------------------------------------
pareto.table <- readr::read_csv("paretoTable.csv", col_names = TRUE)

# Partition the dataset: design inputs (Ax-Fx) plus the output summary
# statistics — Gx-Jx are the output means, Kx-Nx the output standard
# deviations (see the df_output_* splits below).
pareto.IO <- pareto.table %>%
  select(Ax, Bx, Dx, Cx, Fx, Ex, Gx, Hx, Ix, Jx, Kx, Lx, Mx, Nx)
# NOTE: the previous colnames() reassignment was a no-op (select() already
# yields exactly these names) and has been removed.

# Standardize the variables.
# BUG FIX: `center`/`scale` were previously arguments to as.data.frame(),
# which silently ignored them; they belong to scale() itself. (They also
# happen to be scale()'s defaults, so results are unchanged — but the call
# is now correct.)
df <- as.data.frame(scale(pareto.IO, center = TRUE, scale = TRUE))

# Standardized output means, output standard deviations, and both combined.
df_output_mean <- df %>% select("Gx", "Hx", "Ix", "Jx")
df_output_sd <- df %>% select("Kx", "Lx", "Mx", "Nx")
df_output <- df %>% select("Gx", "Hx", "Ix", "Jx", "Kx", "Lx", "Mx", "Nx")
# CLUSTERING BASED ON OUTPUT MEAN ----------------------------------------------
# Select the optimum number of clusters: run the full battery of NbClust
# indices (Ward linkage on Euclidean distances) and visualise the votes.
fviz_nbclust(NbClust(df_output_mean, distance = "euclidean",
                     index = "all", method = "ward.D2"))

## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##

## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 6 proposed 3 as the best number of clusters
## * 3 proposed 4 as the best number of clusters
## * 2 proposed 5 as the best number of clusters
## * 6 proposed 6 as the best number of clusters
## * 2 proposed 7 as the best number of clusters
## * 1 proposed 8 as the best number of clusters
## * 1 proposed 10 as the best number of clusters
## * 2 proposed 15 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 3
##
##
## *******************************************************************
## Among all indices:
## ===================
## * 2 proposed 0 as the best number of clusters
## * 1 proposed 1 as the best number of clusters
## * 6 proposed 3 as the best number of clusters
## * 3 proposed 4 as the best number of clusters
## * 2 proposed 5 as the best number of clusters
## * 6 proposed 6 as the best number of clusters
## * 2 proposed 7 as the best number of clusters
## * 1 proposed 8 as the best number of clusters
## * 1 proposed 10 as the best number of clusters
## * 2 proposed 15 as the best number of clusters
##
## Conclusion
## =========================
## * According to the majority rule, the best number of clusters is 3 .

# Hierarchical clustering (Ward linkage on Euclidean distances).
d <- dist(df_output_mean, method = "euclidean")
link.ward.D2 <- hclust(d, method = "ward.D2")

# Output boxplot by cluster (k = 3, per the NbClust majority rule above).
clust.3 <- cutree(link.ward.D2, k = 3)
ggplot(data = reshape2::melt(pareto.IO %>%
                               select("Gx", "Hx", "Ix", "Jx") %>%
                               mutate(row.ID = 1:nrow(pareto.IO)),
                             id.vars = c("row.ID")) %>%
         left_join(.,
                   data.frame(row.ID = 1:nrow(df),
                              cluster = clust.3),
                   by = c("row.ID"))) +
  # BUG FIX: x must be discrete to get one box per cluster; a numeric x
  # previously collapsed all clusters into a single box.
  geom_boxplot(mapping = aes(x = as.factor(cluster), y = value,
                             fill = as.factor(cluster))) +
  facet_wrap(~variable, scales = "free") +
  scale_fill_discrete("Cluster") +
  xlab("cluster") +
  theme_bw()

# Matrix of scatterplots for visual inspection, coloured by cluster.
matrix.df <- df_output_mean
matrix.df$cluster <- as.factor(clust.3)
ggpairs(matrix.df, columns = 1:4, bins = 10, aes(colour = cluster))

# Calculate the correlation matrix for the output means.
cor.matrix <- cor(df_output_mean)
corrplot::corrplot(cor.matrix, order = "hclust", addrect = 3)

# Hierarchical clustering dendrogram, with the k = 3 cut highlighted.
fviz_dend(link.ward.D2, k = 3, rect = TRUE)

# CLUSTERING BASED ON OUTPUT STDEV ---------------------------------------------
# Select the optimum number of clusters for the output standard deviations.
fviz_nbclust(NbClust(df_output_sd, distance = "euclidean",
                     index = "all", method = "ward.D2"))

## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##

## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 8 proposed 2 as the best number of clusters
## * 3 proposed 3 as the best number of clusters
## * 9 proposed 4 as the best number of clusters
## * 1 proposed 5 as the best number of clusters
## * 1 proposed 9 as the best number of clusters
## * 2 proposed 15 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 4
##
##
## *******************************************************************
## Among all indices:
## ===================
## * 2 proposed 0 as the best number of clusters
## * 8 proposed 2 as the best number of clusters
## * 3 proposed 3 as the best number of clusters
## * 9 proposed 4 as the best number of clusters
## * 1 proposed 5 as the best number of clusters
## * 1 proposed 9 as the best number of clusters
## * 2 proposed 15 as the best number of clusters
##
## Conclusion
## =========================
## * According to the majority rule, the best number of clusters is 4 .

# Hierarchical clustering (Ward linkage on Euclidean distances).
d <- dist(df_output_sd, method = "euclidean")
link.ward.D2 <- hclust(d, method = "ward.D2")

# Output boxplot by cluster (k = 4, per the NbClust majority rule above).
clust.4 <- cutree(link.ward.D2, k = 4)
ggplot(data = reshape2::melt(pareto.IO %>%
                               select("Kx", "Lx", "Mx", "Nx") %>%
                               mutate(row.ID = 1:nrow(pareto.IO)),
                             id.vars = c("row.ID")) %>%
         left_join(.,
                   data.frame(row.ID = 1:nrow(df),
                              cluster = clust.4),
                   by = c("row.ID"))) +
  # BUG FIX: x must be discrete to get one box per cluster; a numeric x
  # previously collapsed all clusters into a single box.
  geom_boxplot(mapping = aes(x = as.factor(cluster), y = value,
                             fill = as.factor(cluster))) +
  facet_wrap(~variable, scales = "free") +
  scale_fill_discrete("Cluster") +
  xlab("cluster") +
  theme_bw()

# Matrix of scatterplots for visual inspection, coloured by cluster.
matrix.df <- df_output_sd
matrix.df$cluster <- as.factor(clust.4)
ggpairs(matrix.df, columns = 1:4, bins = 10, aes(colour = cluster))

# Calculate the correlation matrix for the output standard deviations.
cor.matrix <- cor(df_output_sd)
corrplot::corrplot(cor.matrix, order = "hclust", addrect = 4)

# Hierarchical clustering dendrogram, with the k = 4 cut highlighted.
fviz_dend(link.ward.D2, k = 4, rect = TRUE)

# CLUSTERING BASED ON BOTH OUTPUT SUMMARY STATISTICS ---------------------------
# Select the optimum number of clusters for means and standard deviations
# combined (all eight standardized output columns).
fviz_nbclust(NbClust(df_output, distance = "euclidean",
                     index = "all", method = "ward.D2"))

## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##

## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 6 proposed 2 as the best number of clusters
## * 7 proposed 3 as the best number of clusters
## * 4 proposed 4 as the best number of clusters
## * 1 proposed 5 as the best number of clusters
## * 1 proposed 6 as the best number of clusters
## * 1 proposed 9 as the best number of clusters
## * 2 proposed 13 as the best number of clusters
## * 2 proposed 15 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 3
##
##
## *******************************************************************
## Among all indices:
## ===================
## * 2 proposed 0 as the best number of clusters
## * 6 proposed 2 as the best number of clusters
## * 7 proposed 3 as the best number of clusters
## * 4 proposed 4 as the best number of clusters
## * 1 proposed 5 as the best number of clusters
## * 1 proposed 6 as the best number of clusters
## * 1 proposed 9 as the best number of clusters
## * 2 proposed 13 as the best number of clusters
## * 2 proposed 15 as the best number of clusters
##
## Conclusion
## =========================
## * According to the majority rule, the best number of clusters is 3 .

# Hierarchical clustering (Ward linkage on Euclidean distances).
d <- dist(df_output, method = "euclidean")
link.ward.D2 <- hclust(d, method = "ward.D2")

# Output boxplot by cluster (k = 3, per the NbClust majority rule above).
clust.3 <- cutree(link.ward.D2, k = 3)
ggplot(data = reshape2::melt(pareto.IO %>%
                               select("Gx", "Hx", "Ix", "Jx",
                                      "Kx", "Lx", "Mx", "Nx") %>%
                               mutate(row.ID = 1:nrow(pareto.IO)),
                             id.vars = c("row.ID")) %>%
         left_join(.,
                   data.frame(row.ID = 1:nrow(df),
                              cluster = clust.3),
                   by = c("row.ID"))) +
  # BUG FIX: x must be discrete to get one box per cluster; a numeric x
  # previously collapsed all clusters into a single box.
  geom_boxplot(mapping = aes(x = as.factor(cluster), y = value,
                             fill = as.factor(cluster))) +
  facet_wrap(~variable, scales = "free") +
  scale_fill_discrete("Cluster") +
  xlab("cluster") +
  theme_bw()

# Matrix of scatterplots for visual inspection, coloured by cluster.
matrix.df <- df_output
matrix.df$cluster <- as.factor(clust.3)
ggpairs(matrix.df, columns = 1:8, bins = 10, aes(colour = cluster))

# Calculate the correlation matrix for all eight output statistics.
cor.matrix <- cor(df_output)
corrplot::corrplot(cor.matrix, order = "hclust", addrect = 3)

# Hierarchical clustering dendrogram, with the k = 3 cut highlighted.
fviz_dend(link.ward.D2, k = 3, rect = TRUE)

# CLUSTERING BASED ON BOTH OUTPUT SUMMARY STATISTICS, WITH K = 4 ---------------
# Hierarchical clustering (Ward linkage on Euclidean distances).
d <- dist(df_output, method = "euclidean")
link.ward.D2 <- hclust(d, method = "ward.D2")

# BUG FIX: clustAll.4 was used below but never defined anywhere in the
# script; cut the tree at k = 4 to produce it.
clustAll.4 <- cutree(link.ward.D2, k = 4)

# Output boxplot by cluster.
ggplot(data = reshape2::melt(pareto.IO %>%
                               select("Gx", "Hx", "Ix", "Jx",
                                      "Kx", "Lx", "Mx", "Nx") %>%
                               mutate(row.ID = 1:nrow(pareto.IO)),
                             id.vars = c("row.ID")) %>%
         left_join(.,
                   data.frame(row.ID = 1:nrow(df),
                              cluster = clustAll.4),
                   by = c("row.ID"))) +
  # BUG FIX: x must be discrete to get one box per cluster; a numeric x
  # previously collapsed all clusters into a single box.
  geom_boxplot(mapping = aes(x = as.factor(cluster), y = value,
                             fill = as.factor(cluster))) +
  facet_wrap(~variable, scales = "free") +
  scale_fill_discrete("Cluster") +
  xlab("cluster") +
  theme_bw()

# Matrix of scatterplots for visual inspection, coloured by cluster.
matrix.df <- df_output
matrix.df$cluster <- as.factor(clustAll.4)
ggpairs(matrix.df, columns = 1:8, bins = 10, aes(colour = cluster))

# Calculate the correlation matrix for all eight output statistics.
cor.matrix <- cor(df_output)
# addrect = 4 to match this section's k = 4 (was 3, inconsistent).
corrplot::corrplot(cor.matrix, order = "hclust", addrect = 4)

# Hierarchical clustering dendrogram, with the k = 4 cut highlighted.
fviz_dend(link.ward.D2, k = 4, rect = TRUE)

# 3D plots to visualize the Pareto design inputs, coloured by cluster ----------
# NOTE(review): clust.3 here is whichever k = 3 assignment ran last (the
# combined-output cut if the script runs top to bottom) — confirm that is
# the intended labelling for these plots.
clustered.df <- pareto.IO
clustered.df$cluster <- as.factor(clust.3)

# BUG FIX: the first plot mapped x = ~Bx and y = ~Ax while titling the axes
# 'Ax' and 'Bx' respectively, mislabelling the data; the mapping now matches
# the titles (Ax / Bx / Dx).
plot_ly(clustered.df, x = ~Ax, y = ~Bx, z = ~Dx,
        marker = list(color = ~cluster,
                      colorscale = c('#FFE1A1', '#683531'),
                      showscale = TRUE)) %>%
  add_markers() %>%
  layout(scene = list(xaxis = list(title = 'Ax'),
                      yaxis = list(title = 'Bx'),
                      zaxis = list(title = 'Dx')),
         annotations = list(x = 1.13,
                            y = 1.05,
                            text = 'Cluster',
                            xref = 'paper',
                            yref = 'paper',
                            showarrow = FALSE))

# Second input triple: Cx / Fx / Ex.
plot_ly(clustered.df, x = ~Cx, y = ~Fx, z = ~Ex,
        marker = list(color = ~cluster,
                      colorscale = c('#FFE1A1', '#683531'),
                      showscale = TRUE)) %>%
  add_markers() %>%
  layout(scene = list(xaxis = list(title = 'Cx'),
                      yaxis = list(title = 'Fx'),
                      zaxis = list(title = 'Ex')),
         annotations = list(x = 1.13,
                            y = 1.05,
                            text = 'Cluster',
                            xref = 'paper',
                            yref = 'paper',
                            showarrow = FALSE))

# Output-mean triple: Hx / Ix / Jx.
plot_ly(clustered.df, x = ~Hx, y = ~Ix, z = ~Jx,
        marker = list(color = ~cluster,
                      colorscale = c('#FFE1A1', '#683531'),
                      showscale = TRUE)) %>%
  add_markers() %>%
  layout(scene = list(xaxis = list(title = 'Hx'),
                      yaxis = list(title = 'Ix'),
                      zaxis = list(title = 'Jx')),
         annotations = list(x = 1.13,
                            y = 1.05,
                            text = 'Cluster',
                            xref = 'paper',
                            yref = 'paper',
                            showarrow = FALSE))